In [25]:
import pandas as pd
import plotly.express as px
!pip install scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
Requirement already satisfied: scikit-learn in c:\users\ealija\anaconda3\lib\site-packages (1.0.2)
Requirement already satisfied: scipy>=1.1.0 in c:\users\ealija\anaconda3\lib\site-packages (from scikit-learn) (1.7.3)
Requirement already satisfied: numpy>=1.14.6 in c:\users\ealija\anaconda3\lib\site-packages (from scikit-learn) (1.21.5)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\ealija\anaconda3\lib\site-packages (from scikit-learn) (2.2.0)
Requirement already satisfied: joblib>=0.11 in c:\users\ealija\anaconda3\lib\site-packages (from scikit-learn) (1.1.0)
In [26]:
# Load the data set
df = pd.read_csv("C:/Users/Ealija/Downloads/IMBD_TOP_1000.csv")
In [27]:
# Pie chart to show the distribution of movies by genre
genre_count = df["Genre"].value_counts().reset_index()
genre_count.columns = ["Genre", "Count"]
fig = px.pie(genre_count, values="Count", names="Genre", title="Distribution of Movies by Genre")
fig.show()
In [28]:
df['Gross'] = df['Gross'].str.replace(',', '').astype(float)
In [51]:
# Scatter plot to show the relationship between runtime and gross
fig = px.scatter(df, x="Runtime (min)", y="Gross", title="Relationship Between Runtime and Gross", trendline="ols")
fig.show()
In [30]:
# Bar plot to show the total gross by director
director_gross = df.groupby("Director", as_index=False)["Gross"].sum().sort_values(by="Gross", ascending=False)[:10]
fig = px.bar(director_gross, x="Director", y="Gross", title="Total Gross by Director")
fig.show()
In [31]:
# Line plot to show the trend of gross over the years
gross_year = df.groupby("Released_Year", as_index=False)["Gross"].sum()
fig = px.line(gross_year, x="Released_Year", y="Gross", title="Trend of Gross over the Years")
fig.show()
In [32]:
# Visualization for Runtime vs Meta_score
fig = px.scatter(df, x="Runtime (min)", y="Meta_score", title="Relationship Between Runtime and Meta_score")
fig.show()
In [33]:
# Box plot to show the distribution of IMDB ratings by certificate
fig = px.box(df, x="Certificate", y="IMDB_Rating", title="Distribution of IMDB Ratings by Certificate")
fig.show()
In [52]:
# Group the data by genre and calculate the total gross and average runtime for each genre
genre_stats = df.groupby("Genre").agg({"Gross": "sum", "Runtime (min)": "mean"}).reset_index()
In [53]:
# Group the data by director and calculate the average IMDB rating for each director
director_rating = df.groupby("Director").agg({"IMDB_Rating": "mean"}).reset_index()
In [54]:
# Display the resulting data frames
print(genre_stats)
print(director_rating)
         Genre         Gross  Runtime (min)
0      Action   2.212587e+10     129.046512
1   Adventure   6.022137e+09     134.111111
2   Animation   9.594346e+09      99.585366
3   Biography   5.362483e+09     136.022727
4       Comedy  6.200388e+08      95.000000
5      Comedy   5.381711e+09     113.697183
6       Crime   4.132271e+09     126.392523
7        Drama  2.701246e+09     127.211765
8       Drama   1.009115e+10     123.705882
9      Family   4.391106e+08     107.500000
10    Fantasy   1.360695e+08      85.000000
11  Film-Noir   7.059200e+07     104.000000
12      Horror  2.355221e+08     111.000000
13     Horror   5.683703e+08     100.111111
14    Mystery   4.780601e+08     119.083333
15    Thriller  1.755074e+07     108.000000
16     Western  5.822151e+07     148.250000
                Director  IMDB_Rating
0             Aamir Khan         8.40
1           Aaron Sorkin         7.80
2    Abdellatif Kechiche         7.70
3       Abhishek Chaubey         7.80
4        Abhishek Kapoor         7.70
..                   ...          ...
543          Zack Snyder         7.60
544       Zaza Urushadze         8.20
545          Zoya Akhtar         8.05
546          Çagan Irmak         8.30
547     Ömer Faruk Sorak         8.00

[548 rows x 2 columns]
In [55]:
# Select desired columns and create new dataframe
df1 = df[["Series_Title", "IMDB_Rating", "Gross"]]
In [56]:
# Create a new dataframe with the top 10 highest grossing movies
df2 = df1.nlargest(10, "Gross")
In [57]:
# Perform a left join to merge the two dataframes on the "Series_Title" column
merged_df = pd.merge(df1, df2, on="Series_Title", how="left")
In [58]:
print(merged_df.head())
               Series_Title  IMDB_Rating_x      Gross_x  IMDB_Rating_y  \
0  The Shawshank Redemption            9.3   28341469.0            NaN   
1             The Godfather            9.2  134966411.0            NaN   
2           The Dark Knight            9.0  534858444.0            9.0   
3    The Godfather: Part II            9.0   57300000.0            NaN   
4              12 Angry Men            9.0    4360000.0            NaN   

       Gross_y  
0          NaN  
1          NaN  
2  534858444.0  
3          NaN  
4          NaN  
In [59]:
#This code reads in the CSV file and creates a new dataframe with only the "Series_Title", "IMDB_Rating", and "Gross" columns. It then creates another dataframe with the top 10 highest grossing movies based on the "Gross" column. Finally, it performs a left join to merge the two dataframes on the "Series_Title" column, keeping all the rows from the original dataframe and filling in missing values from the top 10 dataframe with NaN values.

#This join can give helpful statistics to the cinema on the relationship between a movie's IMDB rating and its gross earnings.
In [60]:
#Added mean values to missing data points in GRoss
mean_gross = df["Gross"].mean()
df["Gross"].fillna(mean_gross, inplace=True)
In [61]:
#Added mean values to missing data points in meta_score

df["Meta_score"].fillna(df["Meta_score"].mean(), inplace=True)
In [62]:
# Define the independent variables and the dependent variable
X = df[["Runtime (min)", "IMDB_Rating", "Meta_score"]]
y = df["Gross"]
In [63]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
In [64]:
# Create a linear regression model
model = LinearRegression()
In [65]:
# Train the model on the training data
model.fit(X_train, y_train)
Out[65]:
LinearRegression()
In [66]:
# Make predictions on the testing data
y_pred = model.predict(X_test)
In [49]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)
Mean squared error: 1.0227470671185832e+16
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: